bts = readRDS('bts.rds')
bcg = readRDS('bcg.rds')
bsa = readRDS('bsa.rds')
bsg = readRDS('bsg.rds')
bsr = readRDS('bsr.rds')
bst = readRDS('bst.rds')
btm = readRDS('btm.rds')
timss = readRDS('timss_2015.rds')

Q2

با توجه به این که مقدار pvalue بسیار کم شده است در نتیجه تحصیلات پدر و مادر بر روی نمره های بچه ها تاثیر گذار است که این موضوع به وضوح در نمودار های رسم شده قابل مشاهده است.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(highcharter)
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
av_grade = bsg %>% mutate(grade = (bsmmat01 + bsmmat02 + bsmmat03 + bsmmat04 + bsmmat05)/5)
gra_edu = merge(av_grade %>% select(idstud, idcntry, idschool, grade), bsg %>% select(mother = bsbg07a, father = bsbg07b, idstud, idcntry, idschool), by=c("idstud", "idcntry", "idschool"))
gra_edu = gra_edu[!(is.na(gra_edu$mother)),]
gra_edu = gra_edu[!(is.na(gra_edu$father)),]

good_father = gra_edu %>% filter(father > 4 & father < 9)
bad_father = gra_edu %>% filter(father > 0 & father < 5)

t.test(good_father$grade, bad_father$grade)
## 
##  Welch Two Sample t-test
## 
## data:  good_father$grade and bad_father$grade
## t = 88.314, df = 222970, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  35.54291 37.15635
## sample estimates:
## mean of x mean of y 
##  493.9241  457.5745
ggplot() + geom_density(data = good_father, aes(x = grade, fill = "Educated"), alpha = 0.5) + geom_density(data = bad_father, aes(x = grade, fill = "Uneducated"), alpha = 0.5)  + labs(title = "Father grade density", x = "Grade", y = "Density", fill = "Education Level")

highchart() %>% hc_add_series(density(good_father$grade), type = "area", name = "Educated Father") %>% hc_add_series(density(bad_father$grade), type = "area", name = "Uneducated Father") %>% hc_title(text = "Father grade density") %>% hc_yAxis(title = list(text = "Density")) %>% hc_xAxis(title = list(text = "Grade"))
good_mother = gra_edu %>% filter(mother > 4 & mother < 9)
bad_mother = gra_edu %>% filter(mother > 0 & mother < 5)

t.test(good_mother$grade, bad_mother$grade)
## 
##  Welch Two Sample t-test
## 
## data:  good_mother$grade and bad_mother$grade
## t = 104.51, df = 245500, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  41.41611 42.99924
## sample estimates:
## mean of x mean of y 
##  497.8699  455.6622
ggplot() + geom_density(data = good_mother, aes(x = grade, fill = "Educated"), alpha = 0.5) + geom_density(data = bad_mother, aes(x = grade, fill = "Uneducated"), alpha = 0.5)  + labs(title = "Mother grade density", x = "Grade", y = "Density", fill = "Education Level")

highchart() %>% hc_add_series(density(good_mother$grade), type = "area", name = "Educated Mother") %>% hc_add_series(density(bad_mother$grade), type = "area", name = "Uneducated Mother") %>% hc_title(text = "Mother grade density") %>% hc_yAxis(title = list(text = "Density")) %>% hc_xAxis(title = list(text = "Grade"))

Q3

با توجه به این که مقدار pvalue به دست آمده بسیار کم است در نتیجه میزان امکانات بر نمره بچه ها تاثیر گذار است که این موضوه با توجه به نمودارهای رسم شده به وضوح قابل دیدن است.

facilities = av_grade %>% mutate(total_facility = 22 - bsbg06a - bsbg06b - bsbg06c - bsbg06d - bsbg06e - bsbg06f - bsbg06g - bsbg06h - bsbg06i - bsbg06j - bsbg06k) %>% select(total_facility, grade)
  
facilities = facilities[!(is.na(facilities$total_facility)),]


good_facilities = facilities %>% filter(total_facility > 6 & total_facility < 12)
bad_facilities = facilities %>% filter(total_facility > 0 & total_facility < 7)


t.test(good_facilities$grade, bad_facilities$grade)
## 
##  Welch Two Sample t-test
## 
## data:  good_facilities$grade and bad_facilities$grade
## t = 91.921, df = 124460, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  47.11368 49.16661
## sample estimates:
## mean of x mean of y 
##  493.1770  445.0368
ggplot() + geom_density(data = good_facilities, aes(x = grade, fill = "High"), alpha = 0.5) + geom_density(data = bad_facilities, aes(x = grade, fill = "Low"), alpha = 0.5)  + labs(title = "Facility grade density", x = "Grade", y = "Density", fill = "Facility Level")

highchart() %>% hc_add_series(density(good_facilities$grade), type = "area", name = "High facility") %>% hc_add_series(density(bad_facilities$grade), type = "area", name = "Low facility") %>% hc_title(text = "Facility grade density") %>% hc_yAxis(title = list(text = "Density")) %>% hc_xAxis(title = list(text = "Grade"))                          

Q4

با توجه به این که مقدار pvalue بسیار کوچک است در نتیجه میزان امنیت بر روی نمرات تاثیر گذار است که می توان این موضوع را به راحتی در نمودار ها مشاهده کرد.

safety_in_school = av_grade %>% select(safety = bsbg15b, grade)

safety_in_school = safety_in_school[!(is.na(safety_in_school$safety)),]

safety_group = safety_in_school %>% group_by(safety) %>% summarise(grade = mean(grade))

safety1 = safety_in_school %>% filter(safety == 1)
safety2 = safety_in_school %>% filter(safety == 2)
safety3 = safety_in_school %>% filter(safety == 3)
safety4 = safety_in_school %>% filter(safety == 4)


safety_in_school$safety = factor(safety_in_school$safety, levels = 1:4)
aov_test = aov(grade ~ safety, safety_in_school)
summary(aov_test)
##                 Df    Sum Sq  Mean Sq F value Pr(>F)    
## safety           3 3.928e+07 13093132    1197 <2e-16 ***
## Residuals   274074 2.999e+09    10943                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot() + geom_density(data = safety1, aes(x = grade, fill = "Very safe"), alpha = 0.5) + geom_density(data = safety2, aes(x = grade, fill = "Safe"), alpha = 0.5) + geom_density(data = safety3, aes(x = grade, fill = "Not safe"), alpha = 0.5) + geom_density(data = safety4, aes(x = grade, fill = "Dangerous"), alpha = 0.5) + labs(title = "Safety grade density", x = "Grade", y = "Density", fill = "Safety Level")

highchart() %>% hc_add_series(density(safety1$grade), type = "area", name = "Very safe") %>% hc_add_series(density(safety2$grade), type = "area", name = "Safe") %>% hc_add_series(density(safety3$grade), type = "area", name = "Not safe") %>% hc_add_series(density(safety4$grade), type = "area", name = "Dangerous") %>% hc_title(text = "Safety grade density") %>% hc_yAxis(title = list(text = "Density")) %>% hc_xAxis(title = list(text = "Grade")) 

Q6

با توجه به این که مقدار pvalue بزرگ شده است در نتیجه نمی توان فرض صفر را رد کرد. این موضوع با توجه به نمودار نیز واضح است زیرا میانگین دو دسته دختر ها و پسرها به یکدیگر نزدیک است.

girls_geo = timss %>% filter(content_domain == "Geometry", cognitive_domain == "Applying") %>% select(correct = correct_ratio_per_question_female)
boys_geo = timss %>% filter(content_domain == "Geometry", cognitive_domain == "Applying") %>% select(correct = correct_ratio_per_question_male)

ggplot() + geom_density(data = boys_geo, aes(x = correct, fill = "Boys"), alpha = 0.5) + geom_density(data = girls_geo, aes(x = correct, fill = "Girls"), alpha = 0.5)  + labs(title = "Sex geometry grade density", x = "Correct ratio", y = "Density", fill = "Sex")

highchart() %>% hc_add_series(density(boys_geo$correct), type = "area", name = "Boys") %>% hc_add_series(density(girls_geo$correct), type = "area", name = "Girls") %>% hc_title(text = "Sex geometry grade density") %>% hc_yAxis(title = list(text = "Density")) %>% hc_xAxis(title = list(text = "Correct ratio"))
t.test(boys_geo, girls_geo)
## 
##  Welch Two Sample t-test
## 
## data:  boys_geo and girls_geo
## t = -0.23614, df = 1493.7, p-value = 0.8134
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02430046  0.01907826
## sample estimates:
## mean of x mean of y 
## 0.4086124 0.4112235

Q7

با توجه به این که مقدار pvalue بسیار کوچک شده در نتیجه فرض صفر رد می شود و نمره بچه ها به وضعیت تغذیه آن ها مرتبت است که این مساله در نمودار واضح است.

school_grade = av_grade %>% group_by(idcntry,idschool) %>% summarise(grade = mean(grade))

meals = merge(school_grade %>% select(idcntry, idschool, grade), bcg %>% select(breakfast = bcbg06a, lunch = bcbg06b , idcntry, idschool), by=c("idcntry", "idschool")) %>% mutate(meals_count = breakfast + lunch)

meals = meals[!(is.na(meals$meals_count)),]


ggplot() + geom_density(data = meals, aes(x = grade, fill = meals_count < 4), alpha = 0.5)

meals$meals_count = factor(meals$meals_count, levels = 2:6)

aov_test = aov(grade ~ meals_count, meals)
summary(aov_test)
##               Df   Sum Sq Mean Sq F value Pr(>F)    
## meals_count    4  1710650  427662   63.92 <2e-16 ***
## Residuals   7237 48418895    6690                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Q9

با توجه به این که مقدار pvalue به دست آمده بسیار کوچک است در نتیجه فرض صفر را رد می کنیم. در واقع تعداد غیبت ها بر نمرات بچه ها تاثیر می گذارد که این موضوع به وضوح در نمودارهای کشیده شده مشهود است.

absents = av_grade %>% select(absent = bsbg11, grade)

absent1 = absents %>% filter(absent == 1)
absent2 = absents %>% filter(absent == 2)
absent3 = absents %>% filter(absent == 3)
absent4 = absents %>% filter(absent == 4)


absents$absent = factor(absents$absent, levels = 1:4)
aov_test = aov(grade ~ absent, absents)
summary(aov_test)
##                 Df    Sum Sq  Mean Sq F value Pr(>F)    
## absent           3 2.687e+08 89568437    8687 <2e-16 ***
## Residuals   264981 2.732e+09    10311                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 15145 observations deleted due to missingness
ggplot() + geom_density(data = absent1, aes(x = as.double(grade), fill = "Once a week or more"), alpha = 0.5) + geom_density(data = absent2, aes(x = as.double(grade), fill = "Once every two weeks"), alpha = 0.5) + geom_density(data = absent3, aes(x = as.double(grade), fill = "Once a month"), alpha = 0.5) + geom_density(data = absent4, aes(x = as.double(grade), fill = "Never or almost never"), alpha = 0.5) + labs(title = "Absents grade density", x = "Grade", y = "Density", fill = "Absents Level")

highchart() %>% hc_add_series(density(absent1$grade), type = "area", name = "Once a week or more") %>% hc_add_series(density(absent2$grade), type = "area", name = "Once every two weeks") %>% hc_add_series(density(absent3$grade), type = "area", name = "Once a month") %>% hc_add_series(density(absent4$grade), type = "area", name = "Never or almost never") %>% hc_title(text = "Absents grade density") %>% hc_yAxis(title = list(text = "Density")) %>% hc_xAxis(title = list(text = "Grade")) 

Q10

با توجه به مقدار pvalue به دست آمده. چون این مقدار نسبتا زیاد است در نتیجه نمی توان فرض صفر را رد کرد. با توجه به نمودار هم می توان فهمید که تفاوت چندانی در توزیع نمرات عملی و توصیفی نیست.

iran_app = bsg %>% filter(idcntry == 364) %>% mutate(apply_field = (bssapp01 + bssapp02 + bssapp03 + bssapp04 + bssapp05 + bsmapp01 + bsmapp02 + bsmapp03 + bsmapp04 + bsmapp05)/10, reason_field = (bsmrea01 + bsmrea02 + bsmrea03 + bsmrea04 + bsmrea05 + bssrea01 + bssrea02 + bssrea03 + bssrea04 + bssrea05)/10)

ggplot(iran_app) + geom_density(aes(x = as.double(reason_field), fill = "Reason"), alpha = 0.5) + geom_density(aes(x = as.double(apply_field), fill = "Apply"), alpha = 0.5) + labs(title = "Geometry grade destiny", x = "Grade", y="Destiny", fill = "Geometry")

highchart() %>% hc_add_series(density(iran_app$reason_field), type = "area", name = "Reason Field") %>% hc_add_series(density(iran_app$apply_field), type = "area", name = "Apply Field") %>% hc_title(text = "Geometry grade destiny") %>% hc_yAxis(title = list(text = "Destiny")) %>% hc_xAxis(title = list(text = "Grade"))
t.test(iran_app$reason_field, iran_app$apply_field, alt="greater")
## 
##  Welch Two Sample t-test
## 
## data:  iran_app$reason_field and iran_app$apply_field
## t = -0.40608, df = 12235, p-value = 0.6577
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -3.176549       Inf
## sample estimates:
## mean of x mean of y 
##  451.7581  452.3870

Q11

سه گزاره جالب

در این گزاره تاثیر انگزه بچه ها برای ادامه تحصیل بر روی نمرات آن ها بررسی شده است که با توجه به این که pvalue بسیار کوچک است در نتیجه می توان نتیجه گرفت که مقدار انگیزه بر نمرات تاثیر می گذارد که این موضوع در نمودار مشهود است.

motivations = av_grade %>% select(motivation_level = bsbg08, grade)

motivations = motivations[!(is.na(motivations$motivation_level)),]


motivations$motivation_level = factor(motivations$motivation_level, levels = 1:6)
aov_test = aov(grade ~ motivation_level, motivations)
summary(aov_test)
##                      Df    Sum Sq  Mean Sq F value Pr(>F)    
## motivation_level      5 2.944e+08 58882860    5847 <2e-16 ***
## Residuals        271149 2.730e+09    10070                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot() + geom_density(data = motivations, aes(x = as.double(grade), fill = motivation_level), alpha = 0.5)

در این گزاره تاثیر تهدید شدن بچه ها بر روی نمراتشان را مور بررسی قرار دادیم که با توجه به این که مقدار pvalue بسیار کم بود فرض صفر را رد می کنیم در نتیجه میزان تهدید بر نمرات بچه ها تاثیر می گذارد که این موضوع در نمودار مشهود است.

threatened = av_grade %>% select(threatened_level = bsbg16i, grade)

threatened = threatened[!(is.na(threatened$threatened_level)),]

threatened$threatened_level = factor(threatened$threatened_level, levels = 1:4)
aov_test = aov(grade ~ threatened_level, threatened)
summary(aov_test)
##                      Df    Sum Sq  Mean Sq F value Pr(>F)    
## threatened_level      3 9.830e+07 32765537    3057 <2e-16 ***
## Residuals        268144 2.874e+09    10718                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot() + geom_density(data = threatened, aes(x = as.double(grade), fill = threatened_level), alpha = 0.5)

در این گزاره تاثیر آسیب دیدن بچه ها توسط دیگران بر روی نمراتشان را مور بررسی قرار دادیم که با توجه به این که مقدار pvalue بسیار کم بود فرض صفر را رد می کنیم در نتیجه میزان آسیب دیدن بر نمرات بچه ها تاثیر می گذارد که این موضوع در نمودار مشهود است.

hurt = av_grade %>% select(hurt_level = bsbg16e, grade)

hurt = hurt[!(is.na(hurt$hurt_level)),]

hurt$hurt_level = factor(hurt$hurt_level, levels = 1:4)
aov_test = aov(grade ~ hurt_level, hurt)
summary(aov_test)
##                 Df    Sum Sq  Mean Sq F value Pr(>F)    
## hurt_level       3 6.619e+07 22062880    2035 <2e-16 ***
## Residuals   267812 2.903e+09    10840                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot() + geom_density(data = hurt, aes(x = as.double(grade), fill = hurt_level), alpha = 0.5)